In [1]:
    
%matplotlib inline
    
In [2]:
    
#from __future__ import division
import pandas as pd
import numpy as np
from altair import Chart
    
In [3]:
    
!ls -lah ../data/*csv
    
    
In [4]:
    
# Read one CSV per (offset, window size) combination, tag every row with the
# parameters of the file it came from, and stack everything into `dfa`.
offsets = [150,200,300]
winsizes = [50,80,100,200]
output_tpl = '../data/dfa_mp.offset_{}.win_{}.csv'
output = []
for offset in offsets:
    for winsize in winsizes:
        # DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
        # read_csv with index_col=0 and parse_dates=True is the replacement
        # the pandas documentation gives for it.
        df = pd.read_csv(output_tpl.format(offset, winsize),
                         index_col=0, parse_dates=True)
        df['win'] = winsize
        df['offset'] = offset
        output.append(df)

# A single concat at the end avoids growing the frame inside the loop.
dfa = pd.concat(output)
    
In [5]:
    
# UTR_length is the difference between the end_x and start_x columns.
dfa['UTR_length'] = dfa['end_x'].sub(dfa['start_x'])
dfa
    
    Out[5]:
In [6]:
    
# Select the offset=200 / win=80 rows with UTR_length > 80 and
# ratio_ATCACG > 2, keeping only the three columns used below. The .copy()
# makes `d` independent of `dfa` so the new columns are added safely.
d = dfa.loc[
    (dfa['offset'] == 200)
    & (dfa['win'] == 80)
    & (dfa['UTR_length'] > 80)
    & (dfa['ratio_ATCACG'] > 2),
    ['UTR_length', 'ratio_ATCACG', 'ratio_CGATGT']
].copy()

# log10-transformed versions of the two ratios and of the length.
d['log-bcm'] = d['ratio_ATCACG'].pipe(np.log10)
d['log+bcm'] = d['ratio_CGATGT'].pipe(np.log10)
d['loglen'] = d['UTR_length'].pipe(np.log10)
    
In [7]:
    
# (rows, columns) of the frame `d`.
d.shape
    
    Out[7]:
In [18]:
    
# Display `d`.
# NOTE(review): this renders the whole frame; d.head() would keep the saved
# output short. The execution count (In [18]) is also out of sequence with
# the surrounding cells.
d
    
    Out[18]:
In [28]:
    
from copy import deepcopy
import statsmodels.api as sm
import altair
def linear_regression(x, y):
    """Least-squares straight-line fit of ``y`` on ``x``, evaluated at ``x``.

    Returns one fitted value per entry of ``x``.
    """
    coefficients = np.polyfit(x, y, deg=1)
    fitted = np.polyval(coefficients, x)
    return fitted
def lowess(x, y, frac=1.0 / 7):
    """LOWESS-smoothed ``y`` as a function of ``x``.

    Parameters
    ----------
    x, y : array-like
        Predictor and response values. Note the argument order is swapped
        when calling statsmodels, which expects (endog, exog) = (y, x).
    frac : float, optional
        Fraction of the data used for each local fit. Defaults to one
        seventh, written as a float so it does not collapse to 0 under
        Python 2 integer division.

    Returns
    -------
    The value returned by ``sm.nonparametric.lowess`` with
    ``return_sorted=False`` (per the statsmodels documentation, fitted
    values in the same order and length as the input).
    """
    return sm.nonparametric.lowess(y, x, frac=frac, return_sorted=False)
def rmean(x, y):
    """Centered rolling mean of ``y`` with a window of 1/20 of its length.

    Parameters
    ----------
    x : any
        Accepted so the signature matches the other regression functions;
        it is not used.
    y : pandas.Series
        Values to smooth. Needs ``.shape`` and ``.rolling``.

    Returns
    -------
    pandas.Series
        Rolling mean of ``y``; positions without a full window are NaN.

    NOTE(review): the mean is taken over ``y`` in its existing row order,
    not in ``x`` order. Presumably callers pass data already sorted by
    ``x``; confirm.
    """
    # With fewer than 20 rows the floor division gives 0, i.e. a zero-width
    # window that cannot produce a usable fit. Clamp to one row so short
    # inputs return y itself instead.
    win = max(1, y.shape[0] // 20)
    return y.rolling(center=True, window=win).mean()
class RegressionChart(altair.Chart):
    """``altair.Chart`` subclass that can overlay a fitted line on its points.

    NOTE(review): uses ``altair.LayeredChart`` and ``set_layers``, so it is
    tied to an Altair release that provides them. Confirm the installed
    version.
    """
    @staticmethod
    def _add_regression_column(group, regression_func, x, y, yfit):
        """Store ``regression_func(group[x], group[y])`` in ``group[yfit]``.

        ``group`` is modified in place and is also returned, so the method
        can be used both directly and as a ``groupby(...).apply`` callback.
        """
        group[yfit] = regression_func(group[x], group[y])
        return group
    
    def regression_plot(self, func=linear_regression, **kwargs):
        """Build a layered chart of the points plus a fitted line.

        Parameters
        ----------
        func : callable
            Called as ``func(x_values, y_values)``; its result is stored in
            a new ``<y>_fit`` column. Defaults to ``linear_regression``.
        **kwargs
            Accepted but not used anywhere in this method.

        Returns
        -------
        The result of ``altair.LayeredChart(data).set_layers(points, lines)``,
        where ``data`` is the chart's DataFrame with the fit column added.

        Raises
        ------
        ValueError
            If ``self.data`` is not a DataFrame, or if any encoding channel
            has a truthy ``bin`` setting.
        """
        if not isinstance(self.data, pd.DataFrame):
            raise ValueError("data must be a DataFrame")
            
        # NOTE(review): a mark set earlier by the caller (e.g. mark_circle)
        # is presumably replaced by mark_point here. Confirm.
        points = self.mark_point()
        # The line layer is built from a deep copy of this chart.
        lines = deepcopy(self).mark_line()
        
        encoding = points.encoding.to_dict()
        # NOTE(review): the message says regress(), but this method is named
        # regression_plot.
        if any(enc.get('bin', False) for enc in encoding.values()):
            raise ValueError("regress() cannot handle binned variables")
            
        # Every encoded channel other than x and y contributes its field as
        # a grouping column, so each group gets its own fit.
        group_cols = [enc['field'] for key,enc in encoding.items()
                     if key not in ['x', 'y']]
        x = encoding['x']['field']
        y = encoding['y']['field']
        yfit = y + '_fit'
        # The return value is discarded, so this relies on encode() updating
        # `lines` in place. TODO confirm for the Altair version in use.
        lines.encode(y=yfit)
        if group_cols:
            # self.data is not copied on this path, unlike the ungrouped
            # branch below.
            groups = self.data.groupby(group_cols)
            data = groups.apply(self._add_regression_column, regression_func=func,
                               x=x, y=y, yfit=yfit)
        else:
            # Work on a copy so the chart's own DataFrame is not given the
            # extra column.
            data = self._add_regression_column(self.data.copy(),
                                               regression_func=func,
                                               x=x, y=y, yfit=yfit)
            
        return altair.LayeredChart(data).set_layers(points, lines)
    
In [30]:
    
from altair import X, Y, Scale
# Plot log-bcm against loglen (x scale domain 1.6 to 3) and overlay the fit
# produced by `lowess`.
RegressionChart(d).mark_circle().encode(
    X('loglen:Q', scale=Scale(domain=(1.6, 3))), 
    y='log-bcm'
).regression_plot(func=lowess)
    
    
    
    
In [17]:
    
from altair import X, Y, Scale
# Plot log-bcm against loglen (x scale domain 1.6 to 3) and overlay the fit
# produced by `linear_regression`.
# NOTE(review): this cell's execution count (In [17]) is lower than the
# previous cell's (In [30]), so the notebook was run out of order. It also
# duplicates the previous cell except for `func`.
RegressionChart(d).mark_circle().encode(
    X('loglen:Q', scale=Scale(domain=(1.6, 3))), 
    y='log-bcm'
).regression_plot(func=linear_regression)
    
    
    
    
In [ ]: